import requests as req;
import os;
import re;
import time;

def get_list_image(url, save_dir="bug_file/images", timeout=10):
    """Download every thumbnail image referenced on one listing page.

    The page at ``url`` is fetched, the ``src`` of each
    ``<div class="thumb">`` image is extracted with a regular expression,
    and each image is written to ``save_dir``.

    Args:
        url: Address of the listing page to scrape.
        save_dir: Directory the images are written to; created (including
            missing parents) when it does not exist yet.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The number of images actually written to disk. Images whose
        download answers with an HTTP error status are skipped and not
        counted.

    Raises:
        Exceptions raised by the HTTP client (connection errors, timeouts)
        and ``OSError`` from file-system operations are not caught here and
        propagate to the caller.
    """
    from urllib.parse import urljoin, urlparse

    # makedirs also creates the missing parent directory and, with
    # exist_ok=True, avoids the check-then-create race of exists()/mkdir().
    os.makedirs(save_dir, exist_ok=True)

    # Spoof a browser User-Agent so the request looks like a regular visit.
    header = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        "Mobile-User-Agent":"Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36"
    }

    # Fetch the whole page.
    html_text = req.get(url=url, headers=header, timeout=timeout).text

    # The regular expression below is written against markup of this shape:
    #
    #   <div class="thumb">
    #       <a href="/article/123995110" target="_blank">
    #           <img src="//pic.qiushibaike.com/system/pictures/12399/123995110/medium/1R6RY6LKQ890024Y.jpg" alt="..." class="illustration" width="100%" height="auto">
    #       </a>
    #   </div>
    ex = '<div class="thumb">.*?<a.*?<img src="(.*?)" alt.*?</a>.*?</div>'
    srcs = re.findall(ex, html_text, re.S)

    saved = 0
    for position, item in enumerate(srcs):
        if item.startswith("//"):
            # Protocol-relative address: force https as the scheme.
            src = "https:" + item
        else:
            # Absolute addresses are kept as-is, relative ones are resolved
            # against the page URL.
            src = urljoin(url, item)

        # Download before opening the target file so a failed request does
        # not leave an empty file behind.
        response = req.get(url=src, headers=header, timeout=timeout)
        if not response.ok:
            # Do not store an HTTP error page as if it were an image.
            continue

        # Keep the real extension from the URL path; fall back to ".png"
        # when the path has none.
        extension = os.path.splitext(urlparse(src).path)[1] or ".png"
        # The index suffix keeps names unique even when two downloads finish
        # within the same clock tick.
        file_name = f"{time.time()}_{position}{extension}"

        with open(os.path.join(save_dir, file_name), "wb") as wf:
            wf.write(response.content)
        saved += 1

    return saved



if __name__ == '__main__':
    # Script entry point: walk listing pages 1 through 13, download the
    # images of each page and report the accumulated total at the end.
    print("开始爬取糗事百科图片列表...")

    total = 0
    for page in range(1, 14):
        print("爬取第%d次开始" % page)
        page_url = f"https://www.qiushibaike.com/imgrank/page/{page}/"
        downloaded = get_list_image(page_url)
        total = total + downloaded

    print("爬取成功!!", f"本次共爬取图片{total}张")